# -*- coding: utf-8 -*-

#【案例4-1】检查北京PM10数据集中缺失值
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter04Data/BJPM10.csv")
print(df.info())
print(df.isnull().sum())


#【案例4-2】删除北京PM10数据集中缺失值
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter04Data/BJPM10.csv")
print(df.head(4))
df.dropna(how = 'any', inplace = True)
print(df.head(4))

#删除列数据
df.dropna(axis = 1, how = 'any',inplace = True)
print(df.head(4))


#【案例4-3】替换北京PM10数据集中缺失值
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter04Data/BJPM10.csv")
df.fillna(0, inplace = True)
print(df.head(4))

#将“奥体中心”列中的NaN值替换为0
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter04Data/BJPM10.csv")
df.fillna({"奥体中心":0}, inplace = True)
print(df.head(4))


#【案例4-4】填充北京PM10数据集中缺失值
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter04Data/BJPM10.csv")
#使用缺失值的下一行数值填充
df.fillna(method = "backfill", inplace = True)
print(df.head(4))


#【案例4-5】找出考勤打卡数据集中重复数据
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/Daily.xlsx")
#统计人员编号列中每个姓名出现的次数
print(df["人员编号"].value_counts())


#【案例4-6】删除考勤打卡数据集中重复数据
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/Daily.xlsx")
#指定从“人员编号”列中查找重复项删除
df.drop_duplicates(subset = "人员编号", inplace = True)
print(df.head(4))

import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/Daily.xlsx")
#保留最后一行数据
df.drop_duplicates(subset = "人员编号",keep = "last", inplace = True)
print(df.head(4))


#【案例4-7】检测邮政编码数据集中异常值
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/Zipcode.xlsx")
#设定邮政编码筛选条件，将筛选出的数据存入df_outliers中
df_outliers = df[(df["邮政编码"]<100000) | (df["邮政编码"]>109999)]
print(df_outliers.head(4))


#【案例4-8】处理邮政编码数据集中异常值
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/Zipcode.xlsx")
#用数字100000替换异常值
df.replace(df[(df["邮政编码"]<100000) | (df["邮政编码"]>109999)]["邮政编码"].tolist(),
              100000, inplace=True)
print(df.head(4))


#【案例4-9】转换考勤打卡数据集数据类型
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter04Data/DailyChange.csv")
print(df.info())
#astype是将原数据列的类型改变后返回一个拷贝，因此需要再赋值给原数据列
df["人员编号"] = df["人员编号"].astype(str)
df["刷卡日期"] = df["刷卡日期"].astype("datetime64")
df["刷卡时间"] = df["刷卡时间"].astype("datetime64")
print(df.info())


#【案例4-10】重命名邮政编码数据集索引
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/ZipcodeIndex.xlsx")
print(df.head(4))
#修改行索引为从1开始的整数
df.index = [1,2,3,4,5,6,7]
#修改列索引为所在区域和邮政编码
df.columns = ["所在区域","邮政编码"]
print(df.head(4))


#【案例4-11】重建考勤打卡数据集行索引
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/DailyChange.xlsx")
#将行索引设置为“人员编号”列
df.set_index("人员编号", inplace = True)
print(df.head(2))
#删除索引列“人员编号”
df.reset_index(drop = True, inplace = True)
print(df.head(3))

#【案例4-12】保险公司调查问卷信息数据预处理案例
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter04Data/Insurance.xlsx")
print(df.head(6))
#“性别”列缺失值替换为Male
df["性别"] = df["性别"].fillna("Male")
#“工作年限”列缺失值替换为该列平均值
df["工作年限"] = df["工作年限"].fillna(df["工作年限"].mean())
print(df.head(6))

#删除数据中的重复值
df.drop_duplicates(inplace = True)
print(df.head(6))

#查看数据的统计信息
print(df.describe())

#替换“周工作时长”的异常数据
df.replace(df[df["周工作时长"]>100]["周工作时长"].tolist(), 40, inplace=True)
print(df.head(6))

print(df.info())

#修改“手机号”、“出生日期”和“工作年限”列数据类型
df["手机号"] = df["手机号"].astype(str)
df["出生日期"] = df["出生日期"].astype("datetime64")
df["工作年限"] = df["工作年限"].astype("int64")
print(df.info())

#将“手机号”列设定为索引列
df.set_index("手机号", inplace = True)
print(df.head(2))


